The goal of this project is to utilize statistical matching techniques to search for a subset of the prerelease user population that is most representative of Release.
In this first report, we will perform an exploratory analysis of the data, focusing on investigating the differences between Beta and Release users.
## Loading the training dataset
## Load the saved workspace; presumably this is what provides df_train_f and
## df_validate_f used throughout the report.
## NOTE(review): the path points into one user's home directory -- confirm it
## before re-running on another machine.
load("~/GitHub/ff-beta-release-matching/poc/EDA/data_milestone2_df_train_validate_20191025.RData")

## View train dataframe
kable(head(df_train_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| client_id | num_active_days | content_crashes | active_hours | uri_count | session_length | search_count | num_bookmarks | num_pages | daily_unique_domains | daily_max_tabs | daily_tabs_opened | startup_ms | daily_num_sessions_started | active_hours_max | uri_count_max | session_length_max | search_count_max | num_pages_max | daily_unique_domains_max | daily_max_tabs_max | daily_tabs_opened_max | startup_ms_max | daily_num_sessions_started_max | label | install_year | profile_age | fxa_configured | sync_configured | is_default_browser | locale | normalized_channel | app_version | default_search_engine | country | timezone_offset | num_addons | cpu_cores | cpu_speed_mhz | cpu_l2_cache_kb | cpu_vendor | memory_mb | os_version | is_wow64 | FX_PAGE_LOAD_MS_2_PARENT | TIME_TO_DOM_COMPLETE_MS | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_LOAD_EVENT_END_MS | TIME_TO_DOM_INTERACTIVE_MS | TIME_TO_NON_BLANK_PAINT_MS | profile_age_cat | distro_id_norm | timezone_cat | memory_cat | cpu_speed_cat | cpu_cores_cat | is_release | cpu_l2_cache_kb_cat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 001cf926-92e3-4587-887e-d3156ba24d82 | 8 | 0 | 1.4215278 | 76.1250 | 22.9337499 | 1.875 | 11.00 | 4014.5 | 6.062500 | 7.00000 | 15.00 | 54176.2500 | 0.6250000 | 3.3208333 | 139 | 37.007500 | 6 | 15464 | 17.000000 | 11 | 29 | 180095 | 2 | beta | 2016 | 1160 | False | False | True | en-US | beta | 67 | DuckDuckGo | US | -240 | 8 | 2 | 2527 | 256 | Intel | 4022 | 6.1 | False | 4223.089 | 5220.036 | 9079.136 | 5221.752 | 6157.840 | 5198.1300 | < 5 years | Mozilla | (-6,-4] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 00210163-2123-427e-bb73-398bda9f9eba | 5 | 0 | 0.8305556 | 168.2000 | 2.4390556 | 0.800 | 248.75 | 20599.5 | 7.866667 | 3.40000 | 14.80 | 3164.0667 | 1.2000000 | 1.6708333 | 325 | 8.318333 | 3 | 20719 | 17.000000 | 4 | 33 | 4966 | 2 | beta | 2016 | 1079 | False | False | False | en-US | beta | 67 | DuckDuckGo | GB | 60 | 6 | 2 | 2394 | 256 | Intel | 3810 | 6.1 | False | 2148.350 | 2253.526 | 1159.979 | 2146.827 | 1155.050 | 1015.9784 | < 5 years | Mozilla | (0,2] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 0024fd24-4ef5-4771-850a-9e3846597015 | 2 | 0 | 0.5111111 | 82.0000 | 0.8712505 | 2.500 | 9.00 | 87.0 | 2.166667 | 4.00000 | 8.00 | 23977.9444 | 5.0000000 | 0.8250000 | 145 | 1.464445 | 5 | 87 | 3.333333 | 6 | 15 | 31918 | 9 | beta | 2019 | 745 | False | False | True | en-US | beta | 67 | GB | 60 | 6 | 4 | 2394 | 256 | Intel | 8124 | 10.0 | False | 2699.834 | 2216.994 | 1832.642 | 2119.277 | 1819.612 | 1979.7910 | < 5 years | Mozilla | (0,2] | < 16GB | < 3GHz | < 4 | FALSE | < 256 | |
| 004f70f7-2576-4de5-94b4-5bf1acdca0a8 | 8 | 0 | 0.3946181 | 101.8750 | 4.6785415 | 7.750 | 87.00 | 8882.0 | 7.750000 | 9.25000 | 19.50 | 1703.3125 | 1.3750000 | 1.1625000 | 210 | 9.174722 | 10 | 9044 | 13.000000 | 12 | 37 | 3454 | 2 | beta | 2018 | 130 | True | True | True | en-US | beta | 67 | US | -240 | 10 | 4 | 3991 | 256 | Intel | 16235 | 10.0 | False | 2370.563 | 2368.195 | 1614.073 | 2356.307 | 1497.652 | 850.0551 | < 6 months | Mozilla | (-6,-4] | < 16GB | < 4GHz | < 4 | FALSE | < 256 | |
| 007c0c11-38e4-476b-a494-d732e15ac159 | 4 | 0 | 0.5930556 | 167.5000 | 5.6081245 | 0.250 | 13.00 | 3024.0 | 1.775000 | 8.75000 | 13.75 | 15285.5375 | 2.2500000 | 0.8791667 | 255 | 9.059722 | 1 | 3552 | 2.500000 | 27 | 36 | 24592 | 3 | beta | 2018 | 293 | False | False | False | en-US | beta | 67 | US | 360 | 7 | 2 | 2659 | 3072 | Intel | 3317 | 10.0 | False | 4050.417 | 5106.575 | 3630.348 | 5041.016 | 3618.008 | 2006.3376 | < 2 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | > 1024 | |
| 0294837f-c98f-44ab-8237-30d2eba6c55a | 6 | 0 | 1.6333333 | 165.8333 | 32.4127778 | 5.500 | 17.00 | 7740.7 | 8.583333 | 10.33333 | 20.00 | 975.2222 | 0.1666667 | 3.2319444 | 323 | 41.553611 | 12 | 7975 | 13.000000 | 18 | 35 | 1088 | 1 | beta | 2019 | 502 | False | False | True | en-US | beta | 67 | other (non-bundled) | GB | 60 | 7 | 4 | 1800 | 256 | Intel | 8026 | 10.0 | False | 2740.793 | 2170.362 | 1482.620 | 2033.104 | 1161.901 | 1034.0580 | < 2 years | Mozilla | (0,2] | < 16GB | < 2GHz | < 4 | FALSE | < 256 |
## View validation dataframe
kable(head(df_validate_f)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| client_id | num_active_days | content_crashes | active_hours | uri_count | session_length | search_count | num_bookmarks | num_pages | daily_unique_domains | daily_max_tabs | daily_tabs_opened | startup_ms | daily_num_sessions_started | active_hours_max | uri_count_max | session_length_max | search_count_max | num_pages_max | daily_unique_domains_max | daily_max_tabs_max | daily_tabs_opened_max | startup_ms_max | daily_num_sessions_started_max | label | install_year | profile_age | fxa_configured | sync_configured | is_default_browser | locale | normalized_channel | app_version | default_search_engine | country | timezone_offset | num_addons | cpu_cores | cpu_speed_mhz | cpu_l2_cache_kb | cpu_vendor | memory_mb | os_version | is_wow64 | FX_PAGE_LOAD_MS_2_PARENT | TIME_TO_DOM_COMPLETE_MS | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_LOAD_EVENT_END_MS | TIME_TO_DOM_INTERACTIVE_MS | TIME_TO_NON_BLANK_PAINT_MS | profile_age_cat | distro_id_norm | timezone_cat | memory_cat | cpu_speed_cat | cpu_cores_cat | is_release | cpu_l2_cache_kb_cat |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 001cf926-92e3-4587-887e-d3156ba24d82 | 8 | 0 | 1.5369792 | 71.12500 | 23.6797916 | 3.375000 | 11 | 1890.571 | 9.216667 | 8.500000 | 16.625 | 9928.483 | 0.6250000 | 2.3250000 | 120 | 34.020000 | 6 | 2094 | 20 | 14 | 31 | 17491.667 | 3 | beta | 2016 | 1204 | False | False | True | en-US | beta | 68 | DuckDuckGo | US | -240 | 7 | 2 | 2527 | 256 | Intel | 4022 | 6.1 | False | 3133.947 | 3713.308 | 3748.471 | 3730.944 | 2444.799 | 1972.6632 | < 5 years | Mozilla | (-6,-4] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 00210163-2123-427e-bb73-398bda9f9eba | 2 | 0 | 0.1833333 | 43.50000 | 0.5619445 | 1.500000 | 259 | 22005.000 | 5.000000 | 3.000000 | 6.000 | 5413.500 | 1.0000000 | 0.2152778 | 48 | 0.781111 | 2 | 22005 | 7 | 4 | 7 | 9579.000 | 1 | beta | 2016 | 1124 | False | False | False | en-US | beta | 68 | DuckDuckGo | GB | 60 | 5 | 2 | 2394 | 256 | Intel | 3810 | 6.1 | False | 3226.048 | 2561.596 | 1346.836 | 2523.810 | 1385.350 | 935.6222 | < 5 years | Mozilla | (0,2] | < 4GB | < 3GHz | 2 | FALSE | < 256 |
| 007c0c11-38e4-476b-a494-d732e15ac159 | 2 | 0 | 0.2423611 | 89.00000 | 6.6299995 | 0.000000 | 15 | 7203.000 | 1.000000 | 6.500000 | 9.500 | 7041.667 | 1.0000000 | 0.3222222 | 99 | 9.959722 | 0 | 7203 | 1 | 11 | 12 | 9194.333 | 2 | beta | 2018 | 336 | False | False | False | en-US | beta | 68 | US | 360 | 6 | 2 | 2659 | 3072 | Intel | 3317 | 10.0 | False | 4400.155 | 7244.930 | 3711.445 | 7280.457 | 3929.289 | 1474.5607 | < 2 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | > 1024 | |
| 009ca4e9-874a-4c3e-983d-af0923346efb | 3 | 0 | 0.1365741 | 29.66667 | 0.2358330 | 0.000000 | 11 | 2112.000 | 1.666667 | 1.666667 | 1.000 | 4644.333 | 1.6666667 | 0.1958333 | 53 | 0.387222 | 0 | 2112 | 2 | 2 | 1 | 6964.500 | 2 | beta | 2013 | 1606 | False | False | True | en-US | beta | 68 | GB | 360 | 6 | 2 | 2594 | 256 | Intel | 3965 | 6.2 | False | 5909.683 | 11043.408 | 5398.452 | 10410.288 | 5860.450 | 4538.1379 | < 5 years | Mozilla | (4,6] | < 4GB | < 3GHz | 2 | FALSE | < 256 | |
| 0101d568-0c63-4492-9295-ed57ef78207f | 3 | 0 | 0.3435185 | 15.66667 | 24.0802773 | 0.000000 | 7 | 17.000 | 1.000000 | 1.666667 | 1.500 | 4214.833 | 0.3333333 | 0.5083333 | 19 | 39.200277 | 0 | 18 | 1 | 2 | 2 | 4215.000 | 1 | beta | 2017 | 6 | False | False | False | en-US | beta | 68 | US | -420 | 5 | 4 | 3093 | 256 | Intel | 16274 | 6.3 | True | 2306.655 | 3225.400 | 2761.783 | 3493.289 | 2141.391 | 2823.6923 | < 1 week | Mozilla | (-8,-6] | < 16GB | < 4GHz | < 4 | FALSE | < 256 | |
| 0159675e-15b0-4443-85b1-94de65455636 | 6 | 0 | 0.0946759 | 18.50000 | 10.6411113 | 1.166667 | 7 | 115.500 | 2.666667 | 3.166667 | 4.000 | 2427.250 | 1.1666667 | 0.1944444 | 33 | 29.938334 | 2 | 137 | 5 | 6 | 7 | 5234.000 | 2 | beta | 2019 | 17 | False | False | False | en-US | beta | 68 | US | -240 | 5 | 4 | 3292 | 256 | Intel | 8098 | 10.0 | True | 5100.212 | 6103.080 | 3665.146 | 6118.852 | 3707.192 | 3391.5286 | < 1 month | Mozilla | (-6,-4] | < 16GB | < 4GHz | < 4 | FALSE | < 256 |
To get acquainted with our training dataset, let’s take a look at its basic information.
| rows | columns | discrete_columns | continuous_columns | all_missing_columns | total_missing_values | complete_rows | total_observations | memory_usage |
|---|---|---|---|---|---|---|---|---|
| 302819 | 58 | 20 | 38 | 0 | 0 | 302819 | 17563502 | 135686992 |
To get acquainted with our validation dataset, let’s take a look at its basic information.
| rows | columns | discrete_columns | continuous_columns | all_missing_columns | total_missing_values | complete_rows | total_observations | memory_usage |
|---|---|---|---|---|---|---|---|---|
| 328042 | 58 | 20 | 38 | 0 | 0 | 328042 | 19026436 | 146987912 |
Let’s use the glimpse function to display a vertical preview of the training dataset, so that we can easily see each column’s data type and a sample of its values.
## Vertical preview of the training dataframe: one line per column, showing
## its type and first values (rendered output follows).
glimpse(df_train_f)
## Observations: 302,819
## Variables: 58
## $ client_id <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days <int> 8, 5, 2, 8, 4, 6, 8, 4, 3, 5...
## $ content_crashes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours <dbl> 1.42152778, 0.83055556, 0.51...
## $ uri_count <dbl> 76.12500, 168.20000, 82.0000...
## $ session_length <dbl> 22.93374988, 2.43905560, 0.8...
## $ search_count <dbl> 1.875000, 0.800000, 2.500000...
## $ num_bookmarks <dbl> 11.00, 248.75, 9.00, 87.00, ...
## $ num_pages <dbl> 4014.5000, 20599.5000, 87.00...
## $ daily_unique_domains <dbl> 6.062500, 7.866667, 2.166667...
## $ daily_max_tabs <dbl> 7.000000, 3.400000, 4.000000...
## $ daily_tabs_opened <dbl> 15.000000, 14.800000, 8.0000...
## $ startup_ms <dbl> 54176.2500, 3164.0667, 23977...
## $ daily_num_sessions_started <dbl> 0.6250000, 1.2000000, 5.0000...
## $ active_hours_max <dbl> 3.3208333, 1.6708333, 0.8250...
## $ uri_count_max <int> 139, 325, 145, 210, 255, 323...
## $ session_length_max <dbl> 37.007500, 8.318333, 1.46444...
## $ search_count_max <int> 6, 3, 5, 10, 1, 12, 29, 0, 0...
## $ num_pages_max <dbl> 15464.0, 20719.0, 87.0, 9044...
## $ daily_unique_domains_max <dbl> 17.000000, 17.000000, 3.3333...
## $ daily_max_tabs_max <int> 11, 4, 6, 12, 27, 18, 18, 2,...
## $ daily_tabs_opened_max <int> 29, 33, 15, 37, 36, 35, 170,...
## $ startup_ms_max <dbl> 180095.000, 4966.000, 31918....
## $ daily_num_sessions_started_max <int> 2, 2, 9, 2, 3, 1, 6, 1, 2, 3...
## $ label <fct> beta, beta, beta, beta, beta...
## $ install_year <dbl> 2016, 2016, 2019, 2018, 2018...
## $ profile_age <dbl> 1160, 1079, 745, 130, 293, 5...
## $ fxa_configured <fct> False, False, False, True, F...
## $ sync_configured <fct> False, False, False, True, F...
## $ is_default_browser <fct> True, False, True, True, Fal...
## $ locale <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel <fct> beta, beta, beta, beta, beta...
## $ app_version <dbl> 67, 67, 67, 67, 67, 67, 67, ...
## $ default_search_engine <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country <fct> US, GB, GB, US, US, GB, GB, ...
## $ timezone_offset <int> -240, 60, 60, -240, 360, 60,...
## $ num_addons <dbl> 8.00, 6.00, 6.00, 10.00, 7.0...
## $ cpu_cores <dbl> 2, 2, 4, 4, 2, 4, 2, 2, 2, 2...
## $ cpu_speed_mhz <dbl> 2527, 2394, 2394, 3991, 2659...
## $ cpu_l2_cache_kb <dbl> 256, 256, 256, 256, 3072, 25...
## $ cpu_vendor <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb <int> 4022, 3810, 8124, 16235, 331...
## $ os_version <ord> 6.1, 6.1, 10.0, 10.0, 10.0, ...
## $ is_wow64 <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT <dbl> 4223.0885, 2148.3495, 2699.8...
## $ TIME_TO_DOM_COMPLETE_MS <dbl> 5220.036, 2253.526, 2216.994...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 9079.1364, 1159.9791, 1832.6...
## $ TIME_TO_LOAD_EVENT_END_MS <dbl> 5221.7525, 2146.8265, 2119.2...
## $ TIME_TO_DOM_INTERACTIVE_MS <dbl> 6157.8399, 1155.0503, 1819.6...
## $ TIME_TO_NON_BLANK_PAINT_MS <dbl> 5198.1300, 1015.9784, 1979.7...
## $ profile_age_cat <ord> < 5 years, < 5 years, < 5 ye...
## $ distro_id_norm <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat <fct> "(-6,-4]", "(0,2]", "(0,2]",...
## $ memory_cat <ord> < 4GB, < 4GB, < 16GB, < 16GB...
## $ cpu_speed_cat <ord> < 3GHz, < 3GHz, < 3GHz, < 4G...
## $ cpu_cores_cat <ord> 2, 2, < 4, < 4, 2, < 4, 2, 2...
## $ is_release <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat <fct> < 256, < 256, < 256, < 256, ...
If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.
## Per-column status of the training dataframe (zeros, NAs, infinite values,
## type, unique count). print_results = FALSE keeps df_status() from printing
## to the console, so only the kable table is rendered.
kable(df_status(df_train_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| variable | q_zeros | p_zeros | q_na | p_na | q_inf | p_inf | type | unique |
|---|---|---|---|---|---|---|---|---|
| client_id | 0 | 0.00 | 0 | 0 | 0 | 0 | character | 302805 |
| num_active_days | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 8 |
| content_crashes | 302819 | 100.00 | 0 | 0 | 0 | 0 | integer | 1 |
| active_hours | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 180711 |
| uri_count | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 18610 |
| session_length | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 289472 |
| search_count | 80328 | 26.53 | 0 | 0 | 0 | 0 | numeric | 930 |
| num_bookmarks | 472 | 0.16 | 0 | 0 | 0 | 0 | numeric | 19727 |
| num_pages | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 179391 |
| daily_unique_domains | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 66058 |
| daily_max_tabs | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 2888 |
| daily_tabs_opened | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 4645 |
| startup_ms | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 276270 |
| daily_num_sessions_started | 3100 | 1.02 | 0 | 0 | 0 | 0 | numeric | 654 |
| active_hours_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 39216 |
| uri_count_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 3642 |
| session_length_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 188078 |
| search_count_max | 80328 | 26.53 | 0 | 0 | 0 | 0 | integer | 140 |
| num_pages_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 76011 |
| daily_unique_domains_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1996 |
| daily_max_tabs_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 541 |
| daily_tabs_opened_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 853 |
| startup_ms_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 145192 |
| daily_num_sessions_started_max | 3100 | 1.02 | 0 | 0 | 0 | 0 | integer | 85 |
| label | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| install_year | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 21 |
| profile_age | 6320 | 2.09 | 0 | 0 | 0 | 0 | numeric | 4127 |
| fxa_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| sync_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| is_default_browser | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| locale | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| normalized_channel | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| app_version | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1 |
| default_search_engine | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 6 |
| country | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| timezone_offset | 736 | 0.24 | 0 | 0 | 0 | 0 | integer | 35 |
| num_addons | 53 | 0.02 | 0 | 0 | 0 | 0 | numeric | 2124 |
| cpu_cores | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 27 |
| cpu_speed_mhz | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1232 |
| cpu_l2_cache_kb | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 8 |
| cpu_vendor | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 3 |
| memory_mb | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 5893 |
| os_version | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| is_wow64 | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| FX_PAGE_LOAD_MS_2_PARENT | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 294778 |
| TIME_TO_DOM_COMPLETE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300964 |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300503 |
| TIME_TO_LOAD_EVENT_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 301045 |
| TIME_TO_DOM_INTERACTIVE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300036 |
| TIME_TO_NON_BLANK_PAINT_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 296802 |
| profile_age_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| distro_id_norm | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
| timezone_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 13 |
| memory_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| cpu_speed_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| cpu_cores_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| is_release | 59627 | 19.69 | 0 | 0 | 0 | 0 | logical | 2 |
| cpu_l2_cache_kb_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |

- q_zeros: quantity of zeros (p_zeros: in percent)
- q_inf: quantity of infinite values (p_inf: in percent)
- q_na: quantity of NA (p_na: in percent)
- type: factor, ordered-factor, numeric, integer or character
- unique: quantity of unique values

Let’s use the glimpse function to display a vertical preview of the validation dataset.
## Vertical preview of the validation dataframe: one line per column, showing
## its type and first values (rendered output follows).
glimpse(df_validate_f)
## Observations: 328,042
## Variables: 58
## $ client_id <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days <int> 8, 2, 2, 3, 3, 6, 1, 4, 7, 4...
## $ content_crashes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours <dbl> 1.53697917, 0.18333333, 0.24...
## $ uri_count <dbl> 71.125000, 43.500000, 89.000...
## $ session_length <dbl> 23.6797916, 0.5619445, 6.629...
## $ search_count <dbl> 3.3750000, 1.5000000, 0.0000...
## $ num_bookmarks <dbl> 11.0, 259.0, 15.0, 11.0, 7.0...
## $ num_pages <dbl> 1890.5714, 22005.0000, 7203....
## $ daily_unique_domains <dbl> 9.216667, 5.000000, 1.000000...
## $ daily_max_tabs <dbl> 8.500000, 3.000000, 6.500000...
## $ daily_tabs_opened <dbl> 16.625, 6.000, 9.500, 1.000,...
## $ startup_ms <dbl> 9928.4833, 5413.5000, 7041.6...
## $ daily_num_sessions_started <dbl> 0.6250000, 1.0000000, 1.0000...
## $ active_hours_max <dbl> 2.32500000, 0.21527778, 0.32...
## $ uri_count_max <int> 120, 48, 99, 53, 19, 33, 32,...
## $ session_length_max <dbl> 34.020000, 0.781111, 9.95972...
## $ search_count_max <int> 6, 2, 0, 0, 0, 2, 0, 5, 1, 1...
## $ num_pages_max <dbl> 2094, 22005, 7203, 2112, 18,...
## $ daily_unique_domains_max <dbl> 20.000000, 7.000000, 1.00000...
## $ daily_max_tabs_max <int> 14, 4, 11, 2, 2, 6, 8, 3, 6,...
## $ daily_tabs_opened_max <int> 31, 7, 12, 1, 2, 7, 13, 71, ...
## $ startup_ms_max <dbl> 17491.667, 9579.000, 9194.33...
## $ daily_num_sessions_started_max <int> 3, 1, 2, 2, 1, 2, 2, 2, 2, 1...
## $ label <fct> beta, beta, beta, beta, beta...
## $ install_year <dbl> 2016, 2016, 2018, 2013, 2017...
## $ profile_age <dbl> 1204, 1124, 336, 1606, 6, 17...
## $ fxa_configured <fct> False, False, False, False, ...
## $ sync_configured <fct> False, False, False, False, ...
## $ is_default_browser <fct> True, False, False, True, Fa...
## $ locale <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel <fct> beta, beta, beta, beta, beta...
## $ app_version <dbl> 68, 68, 68, 68, 68, 68, 68, ...
## $ default_search_engine <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country <fct> US, GB, US, GB, US, US, US, ...
## $ timezone_offset <int> -240, 60, 360, 360, -420, -2...
## $ num_addons <dbl> 7.0, 5.0, 6.0, 6.0, 5.0, 5.0...
## $ cpu_cores <dbl> 2, 2, 2, 2, 4, 4, 1, 2, 4, 3...
## $ cpu_speed_mhz <dbl> 2527, 2394, 2659, 2594, 3093...
## $ cpu_l2_cache_kb <dbl> 256, 256, 3072, 256, 256, 25...
## $ cpu_vendor <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb <int> 4022, 3810, 3317, 3965, 1627...
## $ os_version <ord> 6.1, 6.1, 10.0, 6.2, 6.3, 10...
## $ is_wow64 <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT <dbl> 3133.947, 3226.048, 4400.155...
## $ TIME_TO_DOM_COMPLETE_MS <dbl> 3713.308, 2561.596, 7244.930...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 3748.4715, 1346.8361, 3711.4...
## $ TIME_TO_LOAD_EVENT_END_MS <dbl> 3730.944, 2523.810, 7280.457...
## $ TIME_TO_DOM_INTERACTIVE_MS <dbl> 2444.7985, 1385.3500, 3929.2...
## $ TIME_TO_NON_BLANK_PAINT_MS <dbl> 1972.6632, 935.6222, 1474.56...
## $ profile_age_cat <ord> < 5 years, < 5 years, < 2 ye...
## $ distro_id_norm <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat <fct> "(-6,-4]", "(0,2]", "(4,6]",...
## $ memory_cat <ord> < 4GB, < 4GB, < 4GB, < 4GB, ...
## $ cpu_speed_cat <ord> < 3GHz, < 3GHz, < 3GHz, < 3G...
## $ cpu_cores_cat <ord> 2, 2, 2, 2, < 4, < 4, 1, 2, ...
## $ is_release <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat <fct> < 256, < 256, > 1024, < 256,...
## Per-column status of the validation dataframe (zeros, NAs, infinite values,
## type, unique count). This call previously summarised df_train_f again.
## NOTE(review): the table rendered below was produced from df_train_f (its
## counts match the training table); re-knit the report to refresh it.
kable(df_status(df_validate_f, print_results = FALSE)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")

| variable | q_zeros | p_zeros | q_na | p_na | q_inf | p_inf | type | unique |
|---|---|---|---|---|---|---|---|---|
| client_id | 0 | 0.00 | 0 | 0 | 0 | 0 | character | 302805 |
| num_active_days | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 8 |
| content_crashes | 302819 | 100.00 | 0 | 0 | 0 | 0 | integer | 1 |
| active_hours | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 180711 |
| uri_count | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 18610 |
| session_length | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 289472 |
| search_count | 80328 | 26.53 | 0 | 0 | 0 | 0 | numeric | 930 |
| num_bookmarks | 472 | 0.16 | 0 | 0 | 0 | 0 | numeric | 19727 |
| num_pages | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 179391 |
| daily_unique_domains | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 66058 |
| daily_max_tabs | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 2888 |
| daily_tabs_opened | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 4645 |
| startup_ms | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 276270 |
| daily_num_sessions_started | 3100 | 1.02 | 0 | 0 | 0 | 0 | numeric | 654 |
| active_hours_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 39216 |
| uri_count_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 3642 |
| session_length_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 188078 |
| search_count_max | 80328 | 26.53 | 0 | 0 | 0 | 0 | integer | 140 |
| num_pages_max | 7 | 0.00 | 0 | 0 | 0 | 0 | numeric | 76011 |
| daily_unique_domains_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1996 |
| daily_max_tabs_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 541 |
| daily_tabs_opened_max | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 853 |
| startup_ms_max | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 145192 |
| daily_num_sessions_started_max | 3100 | 1.02 | 0 | 0 | 0 | 0 | integer | 85 |
| label | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| install_year | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 21 |
| profile_age | 6320 | 2.09 | 0 | 0 | 0 | 0 | numeric | 4127 |
| fxa_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| sync_configured | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| is_default_browser | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| locale | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| normalized_channel | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| app_version | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1 |
| default_search_engine | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 6 |
| country | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| timezone_offset | 736 | 0.24 | 0 | 0 | 0 | 0 | integer | 35 |
| num_addons | 53 | 0.02 | 0 | 0 | 0 | 0 | numeric | 2124 |
| cpu_cores | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 27 |
| cpu_speed_mhz | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 1232 |
| cpu_l2_cache_kb | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 8 |
| cpu_vendor | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 3 |
| memory_mb | 0 | 0.00 | 0 | 0 | 0 | 0 | integer | 5893 |
| os_version | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| is_wow64 | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 2 |
| FX_PAGE_LOAD_MS_2_PARENT | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 294778 |
| TIME_TO_DOM_COMPLETE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300964 |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300503 |
| TIME_TO_LOAD_EVENT_END_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 301045 |
| TIME_TO_DOM_INTERACTIVE_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 300036 |
| TIME_TO_NON_BLANK_PAINT_MS | 0 | 0.00 | 0 | 0 | 0 | 0 | numeric | 296802 |
| profile_age_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| distro_id_norm | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
| timezone_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 13 |
| memory_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| cpu_speed_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 5 |
| cpu_cores_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | ordered-factor | 6 |
| is_release | 59627 | 19.69 | 0 | 0 | 0 | 0 | logical | 2 |
| cpu_l2_cache_kb_cat | 0 | 0.00 | 0 | 0 | 0 | 0 | factor | 4 |
Are all the variables in the correct data type?
Yes. It seems that this has already been dealt with in preprocessing.
Any variables with lots of zeros?
Yes. Variables with lots of zeros may not be useful for modeling and, in some cases, they may dramatically bias the model. For example, content_crashes is zero in 100% of the rows.
Any variables with lots of NAs?
None. Good news.
Any high cardinality variable?
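None. Factor/categorical variables with a high number of distinct values (~30 or more) tend to cause overfitting when many of their categories contain only a few observations. In the table above, the factor with the most levels is timezone_cat, with 13.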
## Training
## Split each dataset by channel label ----
## Training
df_release <- df_train_f[which(df_train_f$label == "release"), ]
df_beta <- df_train_f[which(df_train_f$label == "beta"), ]
## Validation
df_v_release <- df_validate_f[which(df_validate_f$label == "release"), ]
df_v_beta <- df_validate_f[which(df_validate_f$label == "beta"), ]

## Label frequencies (training, then validation); `f` is overwritten by the
## second call.
f <- freq(df_train_f$label)
f <- freq(df_validate_f$label)

## Bar charts ----
## Frequency distribution release dataframe (training)
plot_bar(
  df_release,
  ggtheme = theme_minimal(base_size = 15),
  title = "Release"
)
## Frequency distribution beta dataframe (training)
plot_bar(
  df_beta,
  ggtheme = theme_minimal(base_size = 15),
  title = "Beta"
)
## Frequency distribution release dataframe (validation)
plot_bar(
  df_v_release,
  ggtheme = theme_minimal(base_size = 15),
  title = "Release"
)
## Frequency distribution beta dataframe (validation)
plot_bar(
  df_v_beta,
  ggtheme = theme_minimal(base_size = 15),
  title = "Beta"
)

## Histograms ----
## View histogram of release dataset (training)
plot_histogram(
  df_release,
  ggtheme = theme_minimal(base_size = 15),
  title = "Release"
)
## View histogram of beta dataset (training)
plot_histogram(
  df_beta,
  ggtheme = theme_minimal(base_size = 15),
  title = "Beta"
)
## View histogram of release dataset (validation). This and the next call
## previously re-plotted the training subsets df_release / df_beta.
plot_histogram(
  df_v_release,
  ggtheme = theme_minimal(base_size = 15),
  title = "Release"
)
## View histogram of beta dataset (validation)
plot_histogram(
  df_v_beta,
  ggtheme = theme_minimal(base_size = 15),
  title = "Beta"
)

## Density plots by label ----

## Build a density plot of one column of `df`, grouped and filled by `label`.
##
##   df       data frame containing a `label` column and the column `x_var`
##   x_var    name of the column to plot, given as a string
##   x_label  title shown on the x axis
##   x_max    optional upper x limit; when supplied, xlim(0, x_max) is added.
##            xlim() removes values outside the range before the density is
##            estimated, so the curve then describes the truncated data only.
##
## Returns the ggplot object (it is not printed here).
density_by_label <- function(df, x_var, x_label, x_max = NULL) {
  p <- ggplot(
    data = df,
    aes(x = .data[[x_var]], group = label, fill = label)
  ) +
    geom_density(adjust = 1.5, alpha = 0.6)

  if (!is.null(x_max)) {
    p <- p + xlim(0, x_max)
  }

  p +
    scale_fill_viridis(discrete = TRUE) +
    scale_color_viridis(discrete = TRUE) +
    labs(x = x_label, y = "Density") +
    theme_ipsum()
}

## Each metric below is drawn as training (left) next to validation (right),
## i.e. a 1 x 2 grid.

## URI count
t <- density_by_label(df_train_f, "uri_count", "URI Count", x_max = 1000)
v <- density_by_label(df_validate_f, "uri_count", "URI Count", x_max = 1000)
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))

## Active hours
t <- density_by_label(df_train_f, "active_hours", "Active Hours")
v <- density_by_label(df_validate_f, "active_hours", "Active Hours")
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))

## Number of pages
t <- density_by_label(df_train_f, "num_pages", "Num Pages", x_max = 50000)
v <- density_by_label(df_validate_f, "num_pages", "Num Pages", x_max = 50000)
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))

## Session length
t <- density_by_label(
  df_train_f, "session_length", "Session Length",
  x_max = 75
)
v <- density_by_label(
  df_validate_f, "session_length", "Session Length",
  x_max = 75
)
plot_grid(t, v, ncol = 2, labels = c("Train", "Validate"))

This section will focus only on user engagement continuous metrics. So, we are going to analyze the following metrics:
- num_active_days
- active_hours
- active_hours_max
- uri_count
- uri_count_max
- session_length
- session_length_max
- search_count
- search_count_max
- num_bookmarks
- num_pages
- num_pages_max
- num_addons
- daily_unique_domains
- daily_unique_domains_max
- daily_max_tabs
- daily_max_tabs_max
- daily_tabs_opened
- daily_tabs_opened_max
- daily_num_sessions_started
- daily_num_sessions_started_max
- startup_ms
- install_year
- profile_age
- timezone_offset
- memory_mb
- cpu_cores
- cpu_speed_mhz
- cpu_l2_cache_kb

kable(text_tbl) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| beta_num_active_days | release_num_active_days | beta_active_hours | release_active_hours | beta_active_hours_max | release_active_hours_max | beta_uri_count | release_uri_count | beta_uri_count_max | release_uri_count_max | beta_session_length | release_session_length | beta_session_length_max | release_session_length_max | beta_search_count | release_search_count | beta_search_count_max | release_search_count_max | beta_num_bookmarks | release_num_bookmarks | beta_num_pages | release_num_pages | beta_num_pages_max | release_num_pages_max | beta_daily_unique_domains | release_daily_unique_domains | beta_daily_max_tabs | release_daily_max_tabs | beta_daily_tabs_opened | release_daily_tabs_opened | beta_daily_num_sessions_started | release_daily_num_sessions_started | beta_daily_unique_domains_max | release_daily_unique_domains_max | beta_daily_max_tabs_max | release_daily_max_tabs_max | beta_daily_tabs_opened_max | release_daily_tabs_opened_max | beta_daily_num_sessions_started_max | release_daily_num_sessions_started_max | beta_startup_ms | release_startup_ms | beta_install_year | release_install_year | beta_profile_age | release_profile_age | beta_timezone_offset | release_timezone_offset | beta_memory_mb | release_memory_mb | beta_cpu_cores | release_cpu_cores | beta_cpu_speed_mhz | release_cpu_speed_mhz | beta_cpu_l2_cache_kb | release_cpu_l2_cache_kb | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. | 1.000000 | 1.000000 | 0.0000000 | 0.0000000 | 0.000000 | 0.0000000 | 1.00000 | 1.00000 | 1.0000 | 1.0000 | 0.0166665 | 0.0192595 | 0.021389 | 0.030556 | 0.0000000 | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.0000 | 1.000 | 0.000 | 1.00 | 0.00 | 1.000000 | 1.000000 | 1.000000 | 0.625000 | 1.00000 | 1.000000 | 0.000000 | 0.000000 | 1.00000 | 1.000000 | 1.00000 | 1.000000 | 1.00000 | 1.00000 | 0.000000 | 0.000000 | 269.000 | 261.128 | 1993.000 | 2000.000 | 0.0000 | 0.0000 | -720.000 | -720.000 | 511.000 | 512.000 | 1.000000 | 1.000000 | 798.000 | 792.00 | 128.0000 | 128.0000 |
| 1st Qu. | 4.000000 | 4.000000 | 0.2250000 | 0.2686111 | 0.450000 | 0.5402778 | 37.00000 | 44.33333 | 68.0000 | 86.0000 | 2.5284028 | 2.1590431 | 4.877777 | 4.408542 | 0.0000000 | 0.000000 | 0.000000 | 0.000000 | 10.0000 | 10.0000 | 686.000 | 1022.125 | 785.00 | 1142.00 | 2.166667 | 2.283333 | 2.600000 | 2.500000 | 4.00000 | 4.000000 | 1.000000 | 1.250000 | 3.00000 | 3.125000 | 4.00000 | 4.000000 | 6.00000 | 7.00000 | 2.000000 | 2.000000 | 2102.206 | 1432.677 | 2017.000 | 2016.000 | 271.0000 | 257.0000 | -300.000 | -300.000 | 3984.000 | 4011.000 | 2.000000 | 2.000000 | 2200.000 | 2261.00 | 256.0000 | 256.0000 |
| Median | 6.000000 | 6.000000 | 0.5309524 | 0.5744444 | 1.063889 | 1.1541667 | 86.66667 | 96.66667 | 172.0000 | 196.0000 | 7.7105554 | 6.3351191 | 14.808889 | 11.701666 | 0.8333333 | 0.875000 | 2.000000 | 2.000000 | 26.0000 | 26.0000 | 4185.667 | 5536.000 | 4340.00 | 5705.50 | 3.562500 | 3.600000 | 4.250000 | 3.714286 | 9.00000 | 8.833333 | 1.666667 | 2.000000 | 5.50000 | 6.000000 | 6.00000 | 6.000000 | 17.00000 | 17.00000 | 3.000000 | 4.000000 | 5088.010 | 3231.339 | 2018.000 | 2018.000 | 711.0000 | 698.0000 | -240.000 | -240.000 | 8031.000 | 8069.000 | 2.000000 | 2.000000 | 2594.000 | 2712.00 | 256.0000 | 256.0000 |
| Mean | 5.346169 | 5.569842 | 0.8236611 | 0.8468557 | 1.577508 | 1.6251135 | 152.74550 | 156.24224 | 311.0213 | 321.3891 | 12.2961990 | 9.2821806 | 22.706568 | 18.210749 | 2.4506498 | 2.376504 | 5.636171 | 5.434352 | 242.4878 | 158.9390 | 17363.463 | 17330.600 | 17558.93 | 17518.75 | 5.060464 | 4.968328 | 9.603628 | 6.200080 | 20.49191 | 17.092979 | 2.368895 | 2.888602 | 8.74361 | 8.552061 | 13.81149 | 9.317556 | 39.64786 | 33.27059 | 4.281399 | 5.248573 | 25835.963 | 9832.051 | 2017.138 | 2017.064 | 893.7534 | 894.7365 | -143.855 | -238.714 | 8965.156 | 9443.657 | 2.975699 | 3.143089 | 2678.209 | 2710.62 | 679.9325 | 625.9611 |
| 3rd Qu. | 8.000000 | 8.000000 | 1.1028646 | 1.1265956 | 2.165278 | 2.1902778 | 188.50000 | 197.00000 | 382.0000 | 400.0000 | 19.5792560 | 13.6608531 | 31.519026 | 26.128403 | 2.8333333 | 3.000000 | 7.000000 | 7.000000 | 96.0000 | 85.2125 | 18605.464 | 19680.625 | 18885.50 | 19922.00 | 6.166667 | 6.070833 | 8.000000 | 6.000000 | 21.75000 | 19.166667 | 2.875000 | 3.500000 | 11.00000 | 11.000000 | 12.00000 | 9.000000 | 42.00000 | 38.00000 | 5.000000 | 6.000000 | 12618.764 | 8394.891 | 2018.000 | 2018.000 | 1354.0000 | 1374.0000 | 60.000 | -240.000 | 10238.000 | 12144.000 | 4.000000 | 4.000000 | 3192.000 | 3193.00 | 512.0000 | 512.0000 |
| Max. | 8.000000 | 8.000000 | 7.2901042 | 7.1222222 | 24.983333 | 23.9666667 | 2931.00000 | 2391.25000 | 15626.0000 | 18032.0000 | 240.8048605 | 91.0663890 | 1255.382223 | 384.288333 | 51.0000000 | 45.750000 | 188.000000 | 217.000000 | 40401.0000 | 18632.0000 | 179657.500 | 168416.286 | 180456.00 | 172543.00 | 44.000000 | 39.375000 | 1012.625000 | 445.375000 | 518.25000 | 347.500000 | 32.833333 | 32.250000 | 100.00000 | 100.000000 | 3149.00000 | 2425.000000 | 3302.00000 | 2410.00000 | 88.000000 | 100.000000 | 17109505.514 | 5358122.833 | 2019.000 | 2019.000 | 7051.0000 | 6922.0000 | 840.000 | 720.000 | 262078.000 | 524254.000 | 36.000000 | 40.000000 | 37214.000 | 15077.00 | 6144.0000 | 6144.0000 |
kable(text_tbl_v) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")

| | beta_num_active_days | release_num_active_days | beta_active_hours | release_active_hours | beta_active_hours_max | release_active_hours_max | beta_uri_count | release_uri_count | beta_uri_count_max | release_uri_count_max | beta_session_length | release_session_length | beta_session_length_max | release_session_length_max | beta_search_count | release_search_count | beta_search_count_max | release_search_count_max | beta_num_bookmarks | release_num_bookmarks | beta_num_pages | release_num_pages | beta_num_pages_max | release_num_pages_max | beta_daily_unique_domains | release_daily_unique_domains | beta_daily_max_tabs | release_daily_max_tabs | beta_daily_tabs_opened | release_daily_tabs_opened | beta_daily_num_sessions_started | release_daily_num_sessions_started | beta_daily_unique_domains_max | release_daily_unique_domains_max | beta_daily_max_tabs_max | release_daily_max_tabs_max | beta_daily_tabs_opened_max | release_daily_tabs_opened_max | beta_daily_num_sessions_started_max | release_daily_num_sessions_started_max | beta_startup_ms | release_startup_ms | beta_install_year | release_install_year | beta_profile_age | release_profile_age | beta_timezone_offset | release_timezone_offset | beta_memory_mb | release_memory_mb | beta_cpu_cores | release_cpu_cores | beta_cpu_speed_mhz | release_cpu_speed_mhz | beta_cpu_l2_cache_kb | release_cpu_l2_cache_kb |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. | 1.000000 | 1.000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.000000 | 1.0000 | 1.00000 | 1.0000 | 1.0000 | 0.0199998 | 0.0157222 | 0.041111 | 0.019722 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.0000 | 0.0000 | 0.00 | 0.00 | 1.000000 | 1.000000 | 0.400000 | 0.5714286 | 1.00000 | 1.000000 | 0.000000 | 0.000000 | 1.00000 | 1.000000 | 1.00000 | 1.000000 | 1.00000 | 1.00000 | 0.000000 | 0.000000 | 289.000 | 2.388333e+02 | 2000.000 | 2000.000 | 0.0000 | 0.000 | -720.0000 | -720.0000 | 511.000 | 512.000 | 1.000000 | 1.000000 | 633.00 | 768.000 | 128.0000 | 128.0000 |
| 1st Qu. | 3.000000 | 4.000000 | 0.2074074 | 0.2640873 | 0.3916667 | 0.537500 | 33.2500 | 44.00000 | 57.0000 | 86.0000 | 2.2133335 | 2.3199533 | 4.002778 | 4.788333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.0000 | 10.00000 | 543.3333 | 991.3333 | 620.00 | 1113.00 | 2.125000 | 2.287037 | 2.500000 | 2.5000000 | 3.50000 | 4.000000 | 1.000000 | 1.166667 | 3.00000 | 3.200000 | 4.00000 | 4.000000 | 5.00000 | 7.00000 | 1.000000 | 2.000000 | 2218.863 | 1.567167e+03 | 2017.000 | 2017.000 | 213.0000 | 235.000 | -300.0000 | -300.0000 | 3981.000 | 4021.000 | 2.000000 | 2.000000 | 2195.00 | 2261.000 | 256.0000 | 256.0000 |
| Median | 5.000000 | 6.000000 | 0.5027778 | 0.5751736 | 0.9625000 | 1.165278 | 80.5000 | 97.42857 | 152.0000 | 199.0000 | 7.1543749 | 6.7808331 | 12.961389 | 12.802778 | 0.750000 | 1.000000 | 2.000000 | 3.000000 | 23.0000 | 25.33333 | 3347.5000 | 5308.0000 | 3513.00 | 5490.00 | 3.500000 | 3.651190 | 4.125000 | 3.8000000 | 8.50000 | 8.857143 | 1.666667 | 2.000000 | 5.20000 | 6.000000 | 6.00000 | 6.000000 | 15.00000 | 17.00000 | 3.000000 | 4.000000 | 5015.522 | 3.345629e+03 | 2018.000 | 2018.000 | 690.0000 | 673.000 | -240.0000 | -240.0000 | 7973.000 | 8073.000 | 2.000000 | 3.000000 | 2594.00 | 2712.000 | 256.0000 | 256.0000 |
| Mean | 4.912574 | 5.710307 | 0.7988445 | 0.8524956 | 1.4711888 | 1.636961 | 146.3339 | 158.71931 | 287.3003 | 328.3892 | 12.3367205 | 9.7067904 | 22.272966 | 18.614122 | 2.324319 | 2.446479 | 5.100206 | 5.633558 | 225.4153 | 158.03362 | 15614.0379 | 17089.9304 | 15779.79 | 17289.08 | 5.148112 | 5.112258 | 9.019717 | 6.3471824 | 20.03166 | 17.187064 | 2.398131 | 2.831202 | 8.58199 | 8.837358 | 12.82845 | 9.539785 | 37.29553 | 33.54423 | 4.141417 | 5.180103 | 50072.589 | 2.727928e+04 | 2017.255 | 2017.194 | 875.2575 | 883.857 | -129.9082 | -240.4712 | 8795.994 | 9719.802 | 2.954155 | 3.191904 | 2656.31 | 2712.603 | 674.5779 | 610.7777 |
| 3rd Qu. | 7.000000 | 8.000000 | 1.0600694 | 1.1399306 | 2.0013889 | 2.220833 | 179.0000 | 200.14286 | 352.0000 | 409.0000 | 18.7390970 | 14.7740975 | 28.951388 | 27.097500 | 2.666667 | 3.000000 | 6.000000 | 7.000000 | 85.0000 | 84.00000 | 15660.6250 | 19359.8333 | 15872.00 | 19606.00 | 6.166667 | 6.250000 | 7.750000 | 6.2857143 | 21.00000 | 19.500000 | 3.000000 | 3.375000 | 10.50000 | 11.000000 | 11.00000 | 10.000000 | 39.00000 | 38.00000 | 5.000000 | 6.000000 | 11210.000 | 7.770437e+03 | 2019.000 | 2019.000 | 1329.0000 | 1368.000 | 60.0000 | -240.0000 | 8189.000 | 12180.000 | 4.000000 | 4.000000 | 3192.00 | 3193.000 | 512.0000 | 512.0000 |
| Max. | 8.000000 | 8.000000 | 7.5402778 | 7.2204861 | 31.1277778 | 25.440278 | 2983.0000 | 2483.16667 | 17548.0000 | 18524.0000 | 286.6983330 | 90.4422220 | 922.285000 | 524.545556 | 50.000000 | 45.000000 | 313.000000 | 208.000000 | 39519.0000 | 20002.14286 | 177583.0000 | 168812.1429 | 182555.00 | 170532.00 | 49.291667 | 42.400000 | 910.400000 | 449.3333333 | 554.00000 | 357.000000 | 32.000000 | 32.250000 | 100.00000 | 100.000000 | 1779.00000 | 2215.000000 | 2551.00000 | 2342.00000 | 110.000000 | 184.000000 | 50660199.500 | 2.259481e+07 | 2019.000 | 2019.000 | 7095.0000 | 6972.000 | 840.0000 | 780.0000 | 294902.000 | 1572801.000 | 32.000000 | 50.000000 | 37221.00 | 28900.000 | 6144.0000 | 6144.0000 |
## QQ plots comparing the Beta and Release samples of every metric in user_eng.
## Each metric yields two panels: the training subsets (titled V67) and the
## validation subsets (titled V68).

## Draw one QQ panel (Beta quantiles on x, Release quantiles on y) and annotate
## it with the two-sample Kolmogorov-Smirnov statistic.
##   beta, release: vectors holding one metric for each channel
##   panel_title:   text drawn above the panel
draw_qq_panel <- function(beta, release, panel_title) {
  # Shared axis limits so the y = x reference line is meaningful.
  rg <- range(beta, release, na.rm = TRUE)
  ks_label <- paste("KS Test = ", round(ks.test(beta, release)$statistic, 3))
  qqplot(beta, release, main = panel_title, xlim = rg, ylim = rg,
         xlab = "Beta", ylab = "Release", pch = 1)
  # Anchor the label at the top-left: x = smallest Beta value, y = overall
  # maximum. Reusing the NA-safe range (and na.rm = TRUE) keeps the position
  # defined when a sample contains NA, instead of failing inside `if`.
  text(min(beta, na.rm = TRUE), rg[2], ks_label, adj = c(0, 1))
  # Reference line: points on it mean identical quantiles in both samples.
  abline(0, 1, col = "#fe346e", lty = 2)
}

old_par <- par(mfrow = c(4, 2)) ## Set up a 4 x 2 plotting space
for (i in user_eng) {
  # Training
  draw_qq_panel(df_beta_ue[[i]], df_release_ue[[i]],
                paste("V67", i, sep = "\n"))
  # Validation
  draw_qq_panel(df_beta_v_ue[[i]], df_release_v_ue[[i]],
                paste("V68", i, sep = "\n"))
}
par(old_par) # restore the previous plotting layout
kable(tbl_ks) %>%
add_header_above(c("KS Distance" = 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
row_spec(c(2,18,21:26), bold = T, color = "white", background = "#c3f584")

| | training | validation |
|---|---|---|
| num_active_days | 0.071 | 0.166 |
| active_hours | 0.045 | 0.058 |
| active_hours_max | 0.044 | 0.077 |
| uri_count | 0.047 | 0.071 |
| uri_count_max | 0.054 | 0.095 |
| session_length | 0.099 | 0.076 |
| session_length_max | 0.073 | 0.040 |
| search_count | 0.014 | 0.045 |
| search_count_max | 0.019 | 0.059 |
| num_bookmarks | 0.026 | 0.042 |
| num_pages | 0.055 | 0.082 |
| num_pages_max | 0.054 | 0.082 |
| num_addons | 0.601 | 0.292 |
| daily_unique_domains | 0.029 | 0.040 |
| daily_unique_domains_max | 0.033 | 0.054 |
| daily_max_tabs | 0.094 | 0.075 |
| daily_max_tabs_max | 0.080 | 0.051 |
| daily_tabs_opened | 0.035 | 0.036 |
| daily_tabs_opened_max | 0.032 | 0.053 |
| daily_num_sessions_started | 0.119 | 0.098 |
| daily_num_sessions_started_max | 0.116 | 0.117 |
| startup_ms | 0.129 | 0.130 |
| install_year | 0.018 | 0.017 |
| profile_age | 0.042 | 0.035 |
| timezone_offset | 0.208 | 0.235 |
| memory_mb | 0.075 | 0.109 |
| cpu_cores | 0.064 | 0.087 |
| cpu_speed_mhz | 0.046 | 0.062 |
| cpu_l2_cache_kb | 0.025 | 0.035 |
The following violin plots depict the distributions of the beta and release subsets for both versions (v67 and v68). A violin plot is a powerful data visualization technique because it allows us to compare both the ranking of several groups and the shape of their distributions.
NOTE: Guiding lines have been added for the following:

- dashed line: mean of each channel
- solid line: median of each channel
## Violin plots
## For every metric in user_eng, draw the per-channel distributions of the
## training data (panel labelled V67) stacked above those of the validation
## data (panel labelled V68).

## Per-channel summary of one metric: one row per `label`, with the statistic
## returned by `stat_fun` stored in column `value`.
summarise_by_label <- function(df, metric, stat_fun) {
  df %>%
    group_by(label) %>%
    summarise(value = stat_fun(.data[[metric]]))
}

## Horizontal violin + box plot of `metric` by channel for one data frame.
## Guide lines: dashed = per-channel mean, solid = per-channel median.
build_violin <- function(df, metric) {
  ggplot(df, aes(x = label, y = .data[[metric]], fill = label)) +
    geom_violin(trim = FALSE) +
    labs(title = metric, x = "Channel", y = "Measure") +
    scale_fill_manual(values = c("#111d5e", "#b21f66")) +
    geom_hline(
      data = summarise_by_label(df, metric, mean),
      aes(yintercept = value, colour = label),
      linetype = "dashed",
      size = 1
    ) +
    geom_hline(
      data = summarise_by_label(df, metric, median),
      aes(yintercept = value, colour = label),
      linetype = "solid",
      size = 1
    ) +
    scale_colour_manual(values = c("blue", "red")) +
    geom_boxplot(width = 0.1, fill = "white", alpha = 0.5) +
    theme_minimal() +
    theme(legend.position = "none") +
    coord_flip()
}

for (i in user_eng) {
  violin_train <- build_violin(df_train_ue, i)
  violin_valid <- build_violin(df_validate_ue, i)
  # print() is required for the grid to be drawn from inside a loop.
  print(plot_grid(violin_train, violin_valid, ncol = 1,
                  labels = c("V67", "V68")))
}

The violin plots above show the relationship of channel type to user engagement metrics. Overall, the results were quite similar comparing the two versions (v67 and v68). Only the following metrics yielded different results:
- active_hours_max
- daily_unique_domains_max

Now, let’s take a closer look at the similar comparative results:

Which variables presented similar/equal means and medians for beta and release users?

- uri_count
- uri_count_max
- session_length_max
- search_count
- search_count_max
- daily_unique_domains
- daily_tabs_opened_max
- install_year
- profile_age
- memory_mb
- cpu_speed_mhz

Which variables presented higher means and/or medians for release users than for beta users?

- num_active_days
- active_hours
- num_pages
- num_pages_max
- daily_num_sessions_started
- daily_num_sessions_started_max
- cpu_cores

Which variables presented lower means and/or medians for release users than for beta users?

- session_length
- num_addons
- daily_tabs_opened
- timezone_offset
- cpu_l2_cache_kb

Four variables were difficult to visualize with violin plots, so we will plot ridgeline charts for them instead.
## Metrics drawn as ridgeline histograms.
vlist <- c("num_bookmarks", "daily_max_tabs", "daily_max_tabs_max", "startup_ms")

## Ridgeline plots
## Binned ridgeline (20 bins) of `metric` for each channel in `df`.
build_ridge <- function(df, metric) {
  ggplot(df, aes(x = .data[[metric]], y = label, fill = label)) +
    geom_density_ridges(alpha = 0.6, stat = "binline", bins = 20) +
    # The metric is mapped to x and the channel to y (no coord_flip here), so
    # the axis titles follow that mapping.
    labs(title = metric, x = "Measure", y = "Channel") +
    theme_ridges() +
    scale_fill_manual(values = c("#111d5e", "#b21f66")) +
    theme(legend.position = "none")
}

for (i in vlist) {
  ridge_train <- build_ridge(df_train_ue, i)
  ridge_valid <- build_ridge(df_validate_ue, i)
  # Training panel (V67) stacked above the validation panel (V68).
  print(plot_grid(ridge_train, ridge_valid, ncol = 1,
                  labels = c("V67", "V68")))
}

This section will focus only on user engagement discrete metrics. So, we are going to analyze the following metrics:
- default_search_engine
- is_default_browser
- profile_age_cat
- distro_id_norm
- memory_cat
- cpu_speed_cat
- cpu_cores_cat
- cpu_l2_cache_kb_cat
- cpu_vendor
- os_version
- is_wow64
- fxa_configured
- sync_configured
- locale
- country
- timezone_cat
- label
- normalized_channel
- is_release

## Bar plots of relative category frequencies, Beta next to Release, for every
## discrete metric in user_eng_dis.

## Draw one bar plot of the relative frequency of each category of `metric`.
##   df:          data frame holding the column `metric`
##   metric:      column name (character), also used as the x-axis title
##   panel_title: title drawn above the panel
##   bar_colour:  fill colour of the bars
## Returns the bar midpoints invisibly.
draw_frequency_bars <- function(df, metric, panel_title, bar_colour) {
  # Divide the frequency counts by the total number of rows. table() drops
  # NA values, so the shares need not sum to 1 when the column has NAs.
  rel_freq <- table(df[[metric]]) / nrow(df)
  bar_mid <- barplot(rel_freq,
    main = panel_title,
    ylim = c(0, 1), border = FALSE, col = bar_colour,
    xlab = metric,
    ylab = "Relative Frequency"
  )
  # Percentage label just above each bar; xpd = NA keeps the label of a
  # 100% bar from being clipped at the top of the plot region.
  text(bar_mid, rel_freq + 0.025, paste0(round(rel_freq * 100), "%"),
       cex = 1, xpd = NA)
  invisible(bar_mid)
}

## Draw the Beta and Release panels of every metric on a 2 x 2 grid
## (two metrics per page) and restore the previous layout afterwards.
plot_channel_frequencies <- function(beta_df, release_df, metrics) {
  old_par <- par(mfrow = c(2, 2)) ## Set up a 2 x 2 plotting space
  on.exit(par(old_par), add = TRUE)
  for (metric in metrics) {
    draw_frequency_bars(beta_df, metric, "Beta", "#111d5e")
    draw_frequency_bars(release_df, metric, "Release", "#b21f66")
  }
  invisible(NULL)
}

## Training subsets
plot_channel_frequencies(df_beta_ue_dis, df_release_ue_dis, user_eng_dis)
## Validation (`_v_`) subsets
plot_channel_frequencies(df_beta_v_ue_dis, df_release_v_ue_dis, user_eng_dis)